{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ab4104e5-4910-45cd-acbb-2e4df2f3d886",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import tqdm\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import accuracy_score,f1_score\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.svm import SVC\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9a370557-3e33-4b53-b3ae-0cc235edb67f",
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_name = '../../data/'\n",
    "\n",
    "feature_column = \"headline\"\n",
    "label_column = \"category\"\n",
    "\n",
    "np.random.seed(42)\n",
    "\n",
    "language = ['amh','eng','fra','hau','ibo','lin','pcm','run','swa','yor'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f5f075f9-7105-4b0c-98a4-891800e44dbc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-------------------------------------------------\n",
      "--------------Working on amh-----------------\n",
      " Training set size : 5244   Dev set size: 752\n"
     ]
    }
   ],
   "source": [
    "print('-------------------------------------------------')\n",
    "print(f'--------------Working on {language}-----------------')\n",
    "\n",
    "train_data = pd.read_csv(f'{folder_name}/{language}/train.tsv',sep='\\t')\n",
    "dev_data = pd.read_csv(f'{folder_name}/{language}/dev.tsv',sep='\\t')\n",
    "test_data = pd.read_csv(f'{folder_name}/{language}/dev.tsv',sep='\\t')\n",
    "\n",
    "print(f' Training set size : {train_data.size}   Dev set size: {dev_data.size}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e2904250-348e-4a34-a6ab-e33b2cbb3b04",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] Sample data \n",
      " ['የስፖርት ኮከቦች እና የንግድ ምልክቶቻቸው- ከቦልት እስከ ክርስቲያኖ ሮናልዶ', 'እግር ኳስ፡ ዩናይትድ፣ አርሴናል፣ ቼልሲ . . . ምን አስበዋል?', 'ዓለምን ካስጨነቃት የዋጋ ንረት ተጠቃሚዎቹ እነማን ናቸው?']\n",
      "[INFO] Found Labels :  ['sports', 'business', 'health', 'politics']\n"
     ]
    }
   ],
   "source": [
    "all_text_list  = train_data[feature_column].values.tolist()+dev_data[feature_column].values.tolist() \n",
    "    \n",
    "print('[INFO] Sample data \\n',all_text_list[:3])\n",
    "\n",
    "train_text,train_label = train_data[feature_column].values.tolist(),train_data[label_column].values.tolist()\n",
    "dev_text,dev_label = dev_data[feature_column].values.tolist(),dev_data[label_column].values.tolist()\n",
    "test_text,test_label = test_data[feature_column].values.tolist(),test_data[label_column].values.tolist()\n",
    "\n",
    "\n",
    "unique_label = train_data[label_column].unique().tolist()\n",
    "\n",
    "print('[INFO] Found Labels : ',unique_label)\n",
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3f105348-e7c7-4587-9c84-5c84f77ca2e1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sizes : ((1311, 19739), (188, 19739), (188, 19739), 1311, 188, 188)\n"
     ]
    }
   ],
   "source": [
    "vectorizer = CountVectorizer(analyzer='char_wb',ngram_range=(1, 3))\n",
    "vectorizer.fit_transform(all_text_list)\n",
    "\n",
    "# # TfidfVectorizer\n",
    "# vectorizer = TfidfVectorizer(analyzer='char_wb',ngram_range=(1, 3))\n",
    "# vectorizer.fit_transform(all_text_list)\n",
    "\n",
    "X_train = vectorizer.transform(train_text).toarray()\n",
    "X_dev= vectorizer.transform(dev_text).toarray()\n",
    "X_test= vectorizer.transform(test_text).toarray()\n",
    "\n",
    "y_train = []\n",
    "for i in train_label:\n",
    "    y_train.append(unique_label.index(i))\n",
    "\n",
    "y_dev = []\n",
    "for i in dev_label:\n",
    "    y_dev.append(unique_label.index(i))\n",
    "\n",
    "y_test = []\n",
    "for i in test_label:\n",
    "    y_test.append(unique_label.index(i))\n",
    "\n",
    "print(f'Sizes : {X_train.shape,X_dev.shape,X_test.shape,len(y_train),len(y_dev),len(y_test)}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c000871b-3e8d-4ec7-b783-6352f208d87f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=======   GaussianNB   =========\n",
      "acc: 0.776595744680851     |  f1_score: 0.7611702347299674\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      sports       0.80      0.87      0.84        47\n",
      "    business       0.77      0.49      0.60        41\n",
      "      health       0.76      0.88      0.81        50\n",
      "    politics       0.77      0.82      0.80        50\n",
      "\n",
      "    accuracy                           0.78       188\n",
      "   macro avg       0.78      0.77      0.76       188\n",
      "weighted avg       0.78      0.78      0.77       188\n",
      "\n",
      "f1 = 0.7678229177446947\n",
      "loss = None\n",
      "precision = 0.7762396544134534\n",
      "recall = 0.776595744680851\n"
     ]
    }
   ],
   "source": [
    "print('=======   GaussianNB   =========')\n",
    "\n",
    "classifier = GaussianNB()\n",
    "classifier.fit(X_train, y_train)\n",
    "\n",
    "# Predict Class\n",
    "y_pred = classifier.predict(X_dev)\n",
    "\n",
    "# Accuracy \n",
    "accuracy = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred, average='macro')\n",
    "\n",
    "\n",
    "print(f'acc: {accuracy}     |  f1_score: {f1}')\n",
    "print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))\n",
    "\n",
    "\n",
    "if not os.path.exists(f\"{language}/GaussianNB\"):\n",
    "    os.makedirs(f\"{language}/GaussianNB\")\n",
    "\n",
    "acc = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred,average='weighted')\n",
    "precision = metrics.precision_score(y_dev, y_pred,average='weighted')\n",
    "recall = metrics.recall_score(y_dev, y_pred,average='weighted')\n",
    "\n",
    "print(f\"f1 = {f1}\")\n",
    "print(f\"loss = {None}\")\n",
    "print(f\"precision = {precision}\")\n",
    "print(f\"recall = {recall}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a59813ff-bbb3-43a4-8a6d-148724301f41",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=======   MultinomialNB   =========\n",
      "acc: 0.8191489361702128     |  f1_score: 0.8130844477055927\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      sports       0.95      0.79      0.86        47\n",
      "    business       0.81      0.63      0.71        41\n",
      "      health       0.73      0.92      0.81        50\n",
      "    politics       0.83      0.90      0.87        50\n",
      "\n",
      "    accuracy                           0.82       188\n",
      "   macro avg       0.83      0.81      0.81       188\n",
      "weighted avg       0.83      0.82      0.82       188\n",
      "\n",
      "f1 = 0.8171517834477221\n",
      "loss = None\n",
      "precision = 0.8301959934273765\n",
      "recall = 0.8191489361702128\n"
     ]
    }
   ],
   "source": [
    "print('=======   MultinomialNB   =========')\n",
    "\n",
    "classifier = MultinomialNB()\n",
    "classifier.fit(X_train, y_train)\n",
    "\n",
    "# Predict Class\n",
    "y_pred = classifier.predict(X_dev)\n",
    "\n",
    "# Accuracy \n",
    "accuracy = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred, average='macro')\n",
    "\n",
    "\n",
    "print(f'acc: {accuracy}     |  f1_score: {f1}')\n",
    "print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))\n",
    "\n",
    "if not os.path.exists(f\"{language}/MultinomialNB\"):\n",
    "    os.makedirs(f\"{language}/MultinomialNB\")\n",
    "\n",
    "acc = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred,average='weighted')\n",
    "precision = metrics.precision_score(y_dev, y_pred,average='weighted')\n",
    "recall = metrics.recall_score(y_dev, y_pred,average='weighted')\n",
    "\n",
    "print(f\"f1 = {f1}\")\n",
    "print(f\"loss = {None}\")\n",
    "print(f\"precision = {precision}\")\n",
    "print(f\"recall = {recall}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3e63585f-688b-456b-9d00-18174c6e211f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=======   KNeighborsClassifier   =========\n",
      "acc: 0.6276595744680851     |  f1_score: 0.6181197127187802\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      sports       0.57      0.79      0.66        47\n",
      "    business       0.50      0.44      0.47        41\n",
      "      health       0.69      0.70      0.69        50\n",
      "    politics       0.78      0.56      0.65        50\n",
      "\n",
      "    accuracy                           0.63       188\n",
      "   macro avg       0.63      0.62      0.62       188\n",
      "weighted avg       0.64      0.63      0.62       188\n",
      "\n",
      "f1 = 0.6246489759511754\n",
      "loss = None\n",
      "precision = 0.6407258538985697\n",
      "recall = 0.6276595744680851\n"
     ]
    }
   ],
   "source": [
    "print('=======   KNeighborsClassifier   =========')\n",
    "\n",
    "classifier = KNeighborsClassifier(n_neighbors=3)\n",
    "classifier.fit(X_train, y_train)\n",
    "\n",
    "# Predict Class\n",
    "y_pred = classifier.predict(X_dev)\n",
    "\n",
    "# Accuracy \n",
    "accuracy = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred, average='macro')\n",
    "\n",
    "\n",
    "print(f'acc: {accuracy}     |  f1_score: {f1}')\n",
    "print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))\n",
    "\n",
    "if not os.path.exists(f\"{language}/KNeighborsClassifier\"):\n",
    "    os.makedirs(f\"{language}/KNeighborsClassifier\")\n",
    "\n",
    "acc = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred,average='weighted')\n",
    "precision = metrics.precision_score(y_dev, y_pred,average='weighted')\n",
    "recall = metrics.recall_score(y_dev, y_pred,average='weighted')\n",
    "\n",
    "print(f\"f1 = {f1}\")\n",
    "print(f\"loss = {None}\")\n",
    "print(f\"precision = {precision}\")\n",
    "print(f\"recall = {recall}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b94d934d-afc6-4eb5-8968-f0246457cb0d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=======   MLPClassifier   =========\n",
      "acc: 0.8138297872340425     |  f1_score: 0.805532853873815\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      sports       0.91      0.87      0.89        47\n",
      "    business       0.77      0.59      0.67        41\n",
      "      health       0.72      0.92      0.81        50\n",
      "    politics       0.88      0.84      0.86        50\n",
      "\n",
      "    accuracy                           0.81       188\n",
      "   macro avg       0.82      0.80      0.81       188\n",
      "weighted avg       0.82      0.81      0.81       188\n",
      "\n",
      "f1 = 0.8108120092089613\n",
      "loss = None\n",
      "precision = 0.8204875409898574\n",
      "recall = 0.8138297872340425\n"
     ]
    }
   ],
   "source": [
    "print('=======   MLPClassifier   =========')\n",
    "\n",
    "classifier = MLPClassifier(random_state=1, max_iter=300)\n",
    "classifier.fit(X_train, y_train)\n",
    "\n",
    "# Predict Class\n",
    "y_pred = classifier.predict(X_dev)\n",
    "\n",
    "# Accuracy \n",
    "accuracy = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred, average='macro')\n",
    "\n",
    "\n",
    "print(f'acc: {accuracy}     |  f1_score: {f1}')\n",
    "print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))\n",
    "\n",
    "if not os.path.exists(f\"{language}/MLPClassifier\"):\n",
    "    os.makedirs(f\"{language}/MLPClassifier\")\n",
    "\n",
    "acc = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred,average='weighted')\n",
    "precision = metrics.precision_score(y_dev, y_pred,average='weighted')\n",
    "recall = metrics.recall_score(y_dev, y_pred,average='weighted')\n",
    "\n",
    "print(f\"f1 = {f1}\")\n",
    "print(f\"loss = {None}\")\n",
    "print(f\"precision = {precision}\")\n",
    "print(f\"recall = {recall}\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a4088004-a70e-41f2-9697-a6a477f217e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=======   XGBClassifier   =========\n",
      "acc: 0.8138297872340425     |  f1_score: 0.8109039309503706\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      sports       0.93      0.81      0.86        47\n",
      "    business       0.80      0.68      0.74        41\n",
      "      health       0.73      0.88      0.80        50\n",
      "    politics       0.83      0.86      0.84        50\n",
      "\n",
      "    accuracy                           0.81       188\n",
      "   macro avg       0.82      0.81      0.81       188\n",
      "weighted avg       0.82      0.81      0.81       188\n",
      "\n",
      "f1 = 0.8136079688925352\n",
      "loss = None\n",
      "precision = 0.8211372134179606\n",
      "recall = 0.8138297872340425\n"
     ]
    }
   ],
   "source": [
    "print('=======   XGBClassifier   =========')\n",
    "\n",
    "classifier = XGBClassifier()\n",
    "classifier.fit(X_train, y_train)\n",
    "\n",
    "# Predict Class\n",
    "y_pred = classifier.predict(X_dev)\n",
    "\n",
    "# Accuracy \n",
    "accuracy = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred, average='macro')\n",
    "\n",
    "\n",
    "print(f'acc: {accuracy}     |  f1_score: {f1}')\n",
    "print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))\n",
    "\n",
    "if not os.path.exists(f\"{language}/XGBClassifier\"):\n",
    "    os.makedirs(f\"{language}/XGBClassifier\")\n",
    "\n",
    "acc = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred,average='weighted')\n",
    "precision = metrics.precision_score(y_dev, y_pred,average='weighted')\n",
    "recall = metrics.recall_score(y_dev, y_pred,average='weighted')\n",
    "\n",
    "print(f\"f1 = {f1}\")\n",
    "print(f\"loss = {None}\")\n",
    "print(f\"precision = {precision}\")\n",
    "print(f\"recall = {recall}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f3e585f2-d21f-4024-9890-4a9cbfb384fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=======   SVC   =========\n",
      "acc: 0.3882978723404255     |  f1_score: 0.3882978723404255\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      sports       0.00      0.00      0.00        47\n",
      "    business       0.00      0.00      0.00        41\n",
      "      health       0.68      0.46      0.55        50\n",
      "    politics       0.32      1.00      0.49        50\n",
      "\n",
      "    accuracy                           0.39       188\n",
      "   macro avg       0.25      0.36      0.26       188\n",
      "weighted avg       0.27      0.39      0.28       188\n",
      "\n",
      "f1 = 0.2760146611836224\n",
      "loss = None\n",
      "precision = 0.26626221088048374\n",
      "recall = 0.3882978723404255\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/azime/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n",
      "/home/azime/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n",
      "/home/azime/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n",
      "/home/azime/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "print('=======   SVC   =========')\n",
    "classifier = SVC(gamma='auto')\n",
    "classifier.fit(X_train, y_train)\n",
    "# Predict Class\n",
    "y_pred = classifier.predict(X_dev)\n",
    "\n",
    "# Accuracy \n",
    "accuracy = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred, average='micro')\n",
    "\n",
    "\n",
    "print(f'acc: {accuracy}     |  f1_score: {f1}')\n",
    "print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))\n",
    "\n",
    "if not os.path.exists(f\"{language}/SVC\"):\n",
    "    os.makedirs(f\"{language}/SVC\")\n",
    "\n",
    "acc = metrics.accuracy_score(y_dev, y_pred)\n",
    "f1 = metrics.f1_score(y_dev, y_pred,average='weighted')\n",
    "precision = metrics.precision_score(y_dev, y_pred,average='weighted')\n",
    "recall = metrics.recall_score(y_dev, y_pred,average='weighted')\n",
    "\n",
    "print(f\"f1 = {f1}\")\n",
    "print(f\"loss = {None}\")\n",
    "print(f\"precision = {precision}\")\n",
    "print(f\"recall = {recall}\")\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d8bbaaf-fd46-474a-b69f-8ce069c2b18b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
